Load Libraries

library(tidyverse)
library(GGally)
library(cowplot)
library(class)
library(caret)
library(e1071)
library(reshape2)
library(stringr)

Load and look at the data

# Read the attrition data set; first row holds column names.
# Fix: use TRUE rather than T (T is an ordinary variable and can be reassigned).
attrition.df <- read.csv("CaseStudy2-data.csv", header = TRUE)
str(attrition.df)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : chr  "No" "No" "No" "No" ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Sales" ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : chr  "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : chr  "Divorced" "Single" "Single" "Married" ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "No" ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
# Read the competition set with MonthlyIncome withheld (salary to be predicted).
# Fix: use TRUE rather than T (T is an ordinary variable and can be reassigned).
noSalary <- read.csv("CaseStudy2CompSet(NoSalary).csv", header = TRUE)
str(noSalary)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  871 872 873 874 875 876 877 878 879 880 ...
##  $ Age                     : int  43 33 55 36 27 39 33 21 30 51 ...
##  $ Attrition               : chr  "No" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr  "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Non-Travel" ...
##  $ DailyRate               : int  1422 461 267 1351 1302 895 750 251 1312 1405 ...
##  $ Department              : chr  "Sales" "Research & Development" "Sales" "Research & Development" ...
##  $ DistanceFromHome        : int  2 13 13 9 19 5 22 10 23 11 ...
##  $ Education               : int  4 1 4 4 3 3 2 2 3 2 ...
##  $ EducationField          : chr  "Life Sciences" "Life Sciences" "Marketing" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1849 995 1372 1949 1619 42 160 1279 159 1367 ...
##  $ EnvironmentSatisfaction : int  1 2 1 1 4 4 3 1 1 4 ...
##  $ Gender                  : chr  "Male" "Female" "Male" "Male" ...
##  $ HourlyRate              : int  92 53 85 66 67 56 95 45 96 82 ...
##  $ JobInvolvement          : int  3 3 4 4 2 3 3 2 1 2 ...
##  $ JobLevel                : int  2 1 4 1 1 2 2 1 1 4 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Scientist" "Sales Executive" "Laboratory Technician" ...
##  $ JobSatisfaction         : int  4 4 3 2 1 4 2 3 3 2 ...
##  $ MaritalStatus           : chr  "Married" "Single" "Single" "Married" ...
##  $ MonthlyRate             : int  19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
##  $ NumCompaniesWorked      : int  1 3 6 1 1 3 0 1 1 3 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "Yes" "No" ...
##  $ PercentSalaryHike       : int  20 18 17 22 11 14 13 20 25 16 ...
##  $ PerformanceRating       : int  4 3 3 4 3 3 3 4 4 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 2 1 3 1 3 3 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 0 2 1 1 0 3 0 ...
##  $ TotalWorkingYears       : int  7 5 24 5 7 19 8 2 10 29 ...
##  $ TrainingTimesLastYear   : int  5 4 2 3 3 6 2 2 2 1 ...
##  $ WorkLifeBalance         : int  3 3 2 3 3 4 4 1 2 2 ...
##  $ YearsAtCompany          : int  7 3 19 5 7 1 7 2 10 5 ...
##  $ YearsInCurrentRole      : int  7 2 7 4 7 0 7 2 7 2 ...
##  $ YearsSinceLastPromotion : int  7 0 3 0 0 0 0 2 0 0 ...
##  $ YearsWithCurrManager    : int  7 2 8 2 7 0 7 2 9 3 ...
Rcolors <- c("aquamarine3", "coral1", "plum4")

Attrition EDA

Attrition Dependent on Marital Status

# Bar chart of marital-status counts, faceted by attrition outcome.
# Fix: the x axis plots MaritalStatus, so label it "Marital Status"
# (the original mislabeled the axis "Attrition").
bar <- ggplot(attrition.df, aes(x = MaritalStatus, fill = MaritalStatus)) +
  geom_bar() +
  facet_wrap(~Attrition) +
  ggtitle("Attrition Dependent on Marital Status ") +
  xlab("Marital Status") +
  ylab("Count") +
  scale_fill_manual(values = Rcolors)

# "Yes" attrition subset: count employees in each marital status.
# Fix: count with exact equality (sum of a logical) instead of
# length(grep(...)) — grep() does regex matching and would also count
# partial matches; nrow() replaces NROW() for a data frame.
df1 <- attrition.df %>%
  filter(Attrition == "Yes")
df1length <- nrow(df1)  # size of the attrition-yes subset
df1married <- sum(df1$MaritalStatus == "Married")
df1divorced <- sum(df1$MaritalStatus == "Divorced")
df1single <- sum(df1$MaritalStatus == "Single")

# Summary frame feeding the pie chart
Yes_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                     Value = c(df1married, df1divorced, df1single))

# Stacked single bar turned into a pie via polar coordinates
plot1 <- ggplot(Yes_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: Yes")
pie1 <- plot1 + coord_polar("y", start = 0)
pie1 <- pie1 + scale_fill_manual(values = Rcolors)

# "No" attrition subset: count employees in each marital status.
# Fix: count with exact equality (sum of a logical) instead of
# length(grep(...)) — grep() does regex matching and would also count
# partial matches; nrow() replaces NROW() for a data frame.
df2 <- attrition.df %>%
  filter(Attrition == "No")
df2length <- nrow(df2)  # size of the attrition-no subset
df2married <- sum(df2$MaritalStatus == "Married")
df2divorced <- sum(df2$MaritalStatus == "Divorced")
df2single <- sum(df2$MaritalStatus == "Single")

# Summary frame feeding the pie chart
No_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                    Value = c(df2married, df2divorced, df2single))

# Stacked single bar turned into a pie via polar coordinates
plot2 <- ggplot(No_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: No")
pie2 <- plot2 + coord_polar("y", start = 0)
pie2 <- pie2 + scale_fill_manual(values = Rcolors)

# Combine the two pies side by side (panels B, C), then stack them
# under the faceted bar chart (panel A) and render the composite figure
piecharts <- plot_grid(pie2,pie1, ncol = 2,  labels = c("B","C"))
allplots <- plot_grid(bar,piecharts, nrow = 2, labels = "A")
allplots

Monthly Income

# Distribution of monthly income, split by attrition outcome
ggplot(attrition.df, aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  scale_fill_manual(values = Rcolors) +
  labs(title = "Monthly Income Based on Attrition",
       x = "Monthly Income",
       y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Monthly Income Based on Job Role

# Monthly income distribution, one panel per job role, split by attrition
ggplot(attrition.df, aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  facet_wrap(~JobRole) +
  scale_fill_manual(values = Rcolors) +
  labs(title = "Monthly Income Based on Job Role",
       x = "Monthly Income",
       y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Percentage of Attrition Based on Multiple Variables

# Helper: percent-stacked bar of Attrition across the levels of one column.
#   column — name of the attrition.df column to plot on x (string)
#   title  — plot title
# xlab(column) reproduces ggplot's default axis label (the column name).
# Fix: the nine original plots were identical except for column/title —
# the copy-pasted blocks are collapsed into one function.
attrition_pct_bar <- function(column, title) {
  ggplot(attrition.df, aes(x = .data[[column]], fill = Attrition)) +
    geom_bar(position = "fill") +
    ggtitle(title) +
    xlab(column) +
    ylab("% Count") +
    scale_y_continuous(labels = scales::percent) +
    theme(legend.position = "none") +
    scale_fill_manual(values = Rcolors)
}

EDA1 <- attrition_pct_bar("RelationshipSatisfaction", "Relationship Satisfaction")
EDA2 <- attrition_pct_bar("JobLevel", "Job Level")
EDA3 <- attrition_pct_bar("JobSatisfaction", "Job Satisfaction")
EDA4 <- attrition_pct_bar("YearsWithCurrManager", "Years With Current Manager")
EDA5 <- attrition_pct_bar("YearsSinceLastPromotion", "Years Since Last Promotion")
EDA6 <- attrition_pct_bar("YearsAtCompany", "Years At Company")
EDA7 <- attrition_pct_bar("YearsInCurrentRole", "Years in Current Role")
EDA8 <- attrition_pct_bar("NumCompaniesWorked", "Companies Worked")
EDA9 <- attrition_pct_bar("MaritalStatus", "Marital Status")

# 3x3 grid of all nine panels
EDA10 <- plot_grid(EDA1,EDA2,EDA3,EDA4,EDA5,EDA6,EDA7,EDA8,EDA9, ncol = 3, nrow = 3)
EDA10

Scatter Plots with Continuous Variables

# Helper: scatter of two continuous columns colored by Attrition.
#   xcol, ycol — attrition.df column names (strings); title — plot title.
# xlab/ylab reproduce ggplot's default labels (the column names).
# Fix: the six original plots were identical except for columns/title —
# the copy-pasted blocks are collapsed into one function.
attrition_scatter <- function(xcol, ycol, title) {
  ggplot(attrition.df, aes(x = .data[[xcol]], y = .data[[ycol]], color = Attrition)) +
    geom_point() +
    ggtitle(title) +
    xlab(xcol) +
    ylab(ycol) +
    scale_color_manual(values = Rcolors)
}

EDA.A <- attrition_scatter("TotalWorkingYears", "PercentSalaryHike",
                           "Total Working Years vs Percent Salary Hike")
EDA.B <- attrition_scatter("YearsWithCurrManager", "PercentSalaryHike",
                           "Years With Current Manager vs Percent Salary Hike")
EDA.C <- attrition_scatter("YearsAtCompany", "YearsInCurrentRole",
                           "Years At Company vs Years In CurrentRole")
EDA.D <- attrition_scatter("Age", "MonthlyIncome", "Age vs Monthly Income")
EDA.E <- attrition_scatter("Age", "YearsSinceLastPromotion",
                           "Age vs Job YearsSinceLastPromotion")
EDA.F <- attrition_scatter("Age", "PercentSalaryHike", "Age vs Percent Salary Hike")

# 2x3 grid of the six panels
EDA.G <- plot_grid(EDA.A,EDA.B,EDA.C,EDA.D,EDA.E,EDA.F, ncol = 2, nrow = 3)
EDA.G

GGpairs Plots

ggpairs(attrition.df, columns = c(2,25,35,36,3), aes(color = Attrition))

Attrition Analysis by Job Role

# Age vs monthly income, one panel per job role, colored by attrition
ggplot(attrition.df, aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  scale_color_manual(values = Rcolors) +
  facet_wrap(~JobRole) +
  geom_point()

Models for Attrition

k-NN Model (Train and Test Sets)

Conclusion: This model does not have enough “Yes” attrition data to be usable. An internal (leave-one-out cross-validated) model may prove better able to handle this skewed data.

# Seeded 85/15 train/test split of the attrition data
set.seed(9)
splitPerc <- .85
trainIndices <- sample(seq_len(nrow(attrition.df)), round(splitPerc * nrow(attrition.df)))
dfTrain <- attrition.df[trainIndices, ]
dfTest <- attrition.df[-trainIndices, ]
# Drop any incomplete rows from both partitions
dfTrain <- na.omit(dfTrain)
dfTest <- na.omit(dfTest)

# Column reference: 2:Age, 18:JobSatisfaction, 20:MonthlyIncome,
# 25:PercentSalaryHike, 30:TotalWorkingYears, 33:YearsAtCompany,
# 34:YearsInCurrentRole, 35:YearsSinceLastPromotion, 36:YearsWithCurrManager

# knn model 
# k-NN on Age (col 2) and YearsSinceLastPromotion (col 35), k = 10;
# prob = TRUE attaches the winning-class vote proportion to the result.
classifications <- knn(dfTrain[,c(2,35)], dfTest[,c(2,35)], dfTrain$Attrition, 
                       prob = TRUE, k = 10)
# Confusion table: rows = actual, columns = predicted
table(dfTest$Attrition,classifications)
##      classifications
##        No Yes
##   No  106   0
##   Yes  22   2
confusionMatrix(table(dfTest$Attrition,classifications))
## Confusion Matrix and Statistics
## 
##      classifications
##        No Yes
##   No  106   0
##   Yes  22   2
##                                           
##                Accuracy : 0.8308          
##                  95% CI : (0.7551, 0.8908)
##     No Information Rate : 0.9846          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1291          
##                                           
##  Mcnemar's Test P-Value : 7.562e-06       
##                                           
##             Sensitivity : 0.82812         
##             Specificity : 1.00000         
##          Pos Pred Value : 1.00000         
##          Neg Pred Value : 0.08333         
##              Prevalence : 0.98462         
##          Detection Rate : 0.81538         
##    Detection Prevalence : 0.81538         
##       Balanced Accuracy : 0.91406         
##                                           
##        'Positive' Class : No              
## 

Loop for many k

# Loop for many k and the average of many training / test partition
# Fix: removed the unused `accs` data frame and the discarded table() call
# inside the inner loop — both were dead code.
iterations = 100
numks = 50

# masterAcc[j, i] holds the test accuracy of k = i on random partition j
masterAcc = matrix(nrow = iterations, ncol = numks)

for(j in 1:iterations)
{
  # Fresh random partition each iteration (splitPerc defined above)
  trainIndices = sample(1:dim(attrition.df)[1],round(splitPerc * dim(attrition.df)[1]))
  train = attrition.df[trainIndices,]
  test = attrition.df[-trainIndices,]
  train = na.omit(train)
  test = na.omit(test)
  for(i in 1:numks)
  {
    classifications = knn(train[,c(2,35)],test[,c(2,35)],train$Attrition, prob = TRUE, k = i)
    CM = confusionMatrix(table(classifications,test$Attrition))
    masterAcc[j,i] = CM$overall[1]  # overall accuracy
  }
}

# Average accuracy per k across all random partitions
MeanAcc = colMeans(masterAcc)

plot(seq(1,numks,1),MeanAcc, type = "l", xlab = "k", ylab = "Mean Accuracy", main = "Mean Accuracy of k Values")

Internal k-NN Model

# Internal Model
# Leave-one-out cross-validated k-NN on the training set (k = 20),
# same two predictors: Age (col 2) and YearsSinceLastPromotion (col 35)
classifications1 <- knn.cv(dfTrain[,c(2,35)],dfTrain$Attrition, k = 20)
confusionMatrix(table(classifications1,dfTrain$Attrition))
## Confusion Matrix and Statistics
## 
##                 
## classifications1  No Yes
##              No  618 107
##              Yes   6   9
##                                           
##                Accuracy : 0.8473          
##                  95% CI : (0.8193, 0.8725)
##     No Information Rate : 0.8432          
##     P-Value [Acc > NIR] : 0.4044          
##                                           
##                   Kappa : 0.1053          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99038         
##             Specificity : 0.07759         
##          Pos Pred Value : 0.85241         
##          Neg Pred Value : 0.60000         
##              Prevalence : 0.84324         
##          Detection Rate : 0.83514         
##    Detection Prevalence : 0.97973         
##       Balanced Accuracy : 0.53399         
##                                           
##        'Positive' Class : No              
## 

Naive Bayes

# Naive Bayes on OverTime (col 24) and MonthlyIncome (col 20) with
# Laplace smoothing, evaluated on the held-out test set
NBmodel <- naiveBayes(dfTrain[,c(24,20)],dfTrain$Attrition,laplace = 1)
table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition)
##      
##        No Yes
##   No  106  24
##   Yes   0   0
confusionMatrix(table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  106  24
##   Yes   0   0
##                                          
##                Accuracy : 0.8154         
##                  95% CI : (0.7379, 0.878)
##     No Information Rate : 0.8154         
##     P-Value [Acc > NIR] : 0.5543         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : 2.668e-06      
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.8154         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.8154         
##          Detection Rate : 0.8154         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : No             
## 
# Naive Bayes using every predictor in the training set
NBmodel1 = naiveBayes(Attrition~.,data = dfTrain)
# Distribution of predicted classes on the test set
table(predict(NBmodel1,dfTest))
## 
##  No Yes 
## 102  28
confusionMatrix(table(predict(NBmodel1,dfTest),dfTest$Attrition))
## Confusion Matrix and Statistics
## 
##      
##       No Yes
##   No  96   6
##   Yes 10  18
##                                          
##                Accuracy : 0.8769         
##                  95% CI : (0.8078, 0.928)
##     No Information Rate : 0.8154         
##     P-Value [Acc > NIR] : 0.0401         
##                                          
##                   Kappa : 0.616          
##                                          
##  Mcnemar's Test P-Value : 0.4533         
##                                          
##             Sensitivity : 0.9057         
##             Specificity : 0.7500         
##          Pos Pred Value : 0.9412         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.8154         
##          Detection Rate : 0.7385         
##    Detection Prevalence : 0.7846         
##       Balanced Accuracy : 0.8278         
##                                          
##        'Positive' Class : No             
## 

New Dataframe for Multiple Variables in Models

# Make new dataframe and make train and test sets
# Columns kept: 2 Age, 3 Attrition, 4 BusinessTravel, 5 DailyRate,
# 6 Department, 7 DistanceFromHome, 15 JobInvolvement, 16 JobLevel,
# 18 JobSatisfaction, 20 MonthlyIncome, 29 StockOptionLevel,
# 30 TotalWorkingYears, 32 WorkLifeBalance, 33 YearsAtCompany,
# 34 YearsInCurrentRole, 35 YearsSinceLastPromotion
set.seed(9)
new.df <- attrition.df[,c(2,3,4,5,6,7,15,16,18,20,29,30,32,33,34,35)]
splitPerc <- .7
trainIndices <- sample(1:dim(new.df)[1],round(splitPerc * dim(new.df)[1]))
dfTrain1 <- new.df[trainIndices,]
dfTest1 <- new.df[-trainIndices,]
dfTrain1 <- na.omit(dfTrain1)
dfTest1 <- na.omit(dfTest1)

Run Model with New Dataframe

# Naive Bayes Model for New Dataframe
# Fit on the reduced predictor set and evaluate on its 30% holdout
new.dfmodel = naiveBayes(Attrition~.,data = dfTrain1)
confusionMatrix(table(predict(new.dfmodel,dfTest1),dfTest1$Attrition))
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  206  20
##   Yes  17  18
##                                           
##                Accuracy : 0.8582          
##                  95% CI : (0.8099, 0.8982)
##     No Information Rate : 0.8544          
##     P-Value [Acc > NIR] : 0.4733          
##                                           
##                   Kappa : 0.4109          
##                                           
##  Mcnemar's Test P-Value : 0.7423          
##                                           
##             Sensitivity : 0.9238          
##             Specificity : 0.4737          
##          Pos Pred Value : 0.9115          
##          Neg Pred Value : 0.5143          
##              Prevalence : 0.8544          
##          Detection Rate : 0.7893          
##    Detection Prevalence : 0.8659          
##       Balanced Accuracy : 0.6987          
##                                           
##        'Positive' Class : No              
## 

Linear Regression

Attrition Linear Regression

Not sure if this is actually correct to use for this kind of data

# change attrition variable to numeric (Yes = 1, No = 0) for use in
# linear regression models.
# Fix: recode with a direct logical comparison instead of chained gsub()
# calls — regex substitution is fragile for recoding; equality is exact
# and yields the numeric vector in one step.
attrition.lm <- attrition.df
attrition.lm$Attrition <- as.numeric(attrition.lm$Attrition == "Yes")

# run model: interaction, additive, and single-predictor variants
attritionfit = lm(Attrition~Age*MonthlyIncome, data = attrition.lm)
attritionfit1 = lm(Attrition~Age+MonthlyIncome, data = attrition.lm)
attritionfit2 = lm(Attrition~MonthlyIncome, data = attrition.lm)
attritionfit3 = lm(Attrition~Age, data = attrition.lm)

# information about model: coefficients and fit statistics
summary(attritionfit)
## 
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = attrition.lm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37829 -0.19156 -0.13330 -0.06808  1.02288 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.202e-01  8.883e-02   6.981 5.83e-12 ***
## Age               -1.030e-02  2.353e-03  -4.379 1.34e-05 ***
## MonthlyIncome     -5.761e-05  1.400e-05  -4.114 4.26e-05 ***
## Age:MonthlyIncome  1.130e-06  3.153e-07   3.583 0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3599 on 866 degrees of freedom
## Multiple R-squared:  0.04538,    Adjusted R-squared:  0.04207 
## F-statistic: 13.72 on 3 and 866 DF,  p-value: 9.485e-09
# 95% confidence intervals for the interaction-model coefficients
confint(attritionfit)
##                           2.5 %        97.5 %
## (Intercept)        4.457982e-01  7.945094e-01
## Age               -1.491937e-02 -5.684270e-03
## MonthlyIncome     -8.508807e-05 -3.012455e-05
## Age:MonthlyIncome  5.109013e-07  1.748437e-06
# Point estimates of the coefficients
attritionfit$coefficients
##       (Intercept)               Age     MonthlyIncome Age:MonthlyIncome 
##      6.201538e-01     -1.030182e-02     -5.760631e-05      1.129669e-06
hist(attritionfit$residuals, col = "blue", main = "Histogram of Residuals")

# Root mean squared error of the fit
sqrt(mean(attritionfit$residuals^2))
## [1] 0.3590231
# Residual sum of squares
sum((attritionfit$residuals)^2)
## [1] 112.1409

Train and Test for Linear Regression Attrition

# Seeded 80/20 split of the numeric-attrition frame
set.seed(9)
splitPerc <- .8
trainIndices <- sample(seq_len(nrow(attrition.lm)), round(splitPerc * nrow(attrition.lm)))
dfTrain2 <- attrition.lm[trainIndices, ]
dfTest2 <- attrition.lm[-trainIndices, ]
# Drop incomplete rows from both partitions
dfTrain2 <- na.omit(dfTrain2)
dfTest2 <- na.omit(dfTest2)

# Best LM model for Attrition: Age x MonthlyIncome interaction
LR.fit <- lm(Attrition ~ Age * MonthlyIncome, data = dfTrain2)
summary(LR.fit)
## 
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = dfTrain2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38770 -0.19212 -0.13098 -0.06281  0.97511 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.504e-01  9.709e-02   6.698 4.37e-11 ***
## Age               -1.093e-02  2.578e-03  -4.238 2.56e-05 ***
## MonthlyIncome     -6.789e-05  1.542e-05  -4.403 1.24e-05 ***
## Age:MonthlyIncome  1.351e-06  3.464e-07   3.900 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3576 on 692 degrees of freedom
## Multiple R-squared:  0.05128,    Adjusted R-squared:  0.04716 
## F-statistic: 12.47 on 3 and 692 DF,  p-value: 6.013e-08
# Predictions on the 20% holdout
LR.Preds <- predict(LR.fit, newdata = dfTest2)

# Metrics for model
# Fix: corrected the misspelled "Resisdual" column name and derived RMSE
# from the already-computed squared residuals.
MSPE <- data.frame(Observed = dfTest2$Attrition, Predicted = LR.Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual <- MSPE$Residual^2
# Mean squared prediction error
mean(MSPE$SquaredResidual)
## [1] 0.1362915
# Root mean squared prediction error
RMSE <- sqrt(mean(MSPE$SquaredResidual))
RMSE
## [1] 0.3691768

Linear Regression for Salary

Change Job Level to Character

# Treat JobLevel as a categorical predictor.
# Fix: the five gsub(n, "n", ...) calls each replaced a digit with itself
# and were no-ops; a single as.character() performs the intended
# numeric-to-character conversion.
attrition.jl <- attrition.df
attrition.jl$JobLevel <- as.character(attrition.jl$JobLevel)

# Job level model: one coefficient per job level (level 1 as baseline)
JLModel_fit = lm(MonthlyIncome~JobLevel, data = attrition.jl)
summary(JLModel_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.jl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4642.2  -668.0  -107.3   668.3  4412.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2743.82      69.69   39.37   <2e-16 ***
## JobLevel2    2800.46      99.89   28.04   <2e-16 ***
## JobLevel3    7108.38     130.24   54.58   <2e-16 ***
## JobLevel4   12509.83     177.45   70.50   <2e-16 ***
## JobLevel5   16480.15     219.18   75.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9244 
## F-statistic:  2658 on 4 and 865 DF,  p-value: < 2.2e-16
# Fitted values and residual diagnostics for the categorical JobLevel model
preds <- predict(JLModel_fit)
hist(JLModel_fit$residuals, col = "blue", main = "Histogram of Residuals")

plot(JLModel_fit$fitted.values,JLModel_fit$residuals, main = "Plot of Residuals v. Fitted Values")

# Fitted means overlaid on the raw data.
# Fix: col = "red" belongs outside aes(); inside aes() it is treated as a
# data value routed through the color scale, not as the literal color red.
attrition.jl %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(data = attrition.jl, aes(x = JobLevel, y = preds), col = "red")

Predict Using Single Variable

# Baseline single-variable model: JobLevel treated as numeric
Model1_fit = lm(MonthlyIncome~JobLevel, data = attrition.df)
summary(Model1_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5037.1  -928.2    80.1   697.1  3723.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1793.93     101.68  -17.64   <2e-16 ***
## JobLevel     4013.67      43.98   91.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1413 on 868 degrees of freedom
## Multiple R-squared:  0.9056, Adjusted R-squared:  0.9055 
## F-statistic:  8329 on 1 and 868 DF,  p-value: < 2.2e-16
hist(Model1_fit$residuals, col = "blue", main = "Histogram of Residuals")

# Predict monthly income (with confidence bounds) for the no-salary set;
# column 1 of the result holds the point estimates
SalaryPredict <- predict(Model1_fit, noSalary, interval = "confidence") 
SalaryPredict <- as.data.frame(SalaryPredict)
noSalary$MonthlyIncome <- SalaryPredict[,1]

#predicted values plotted
# Fix: "Varialbe" typo in the title corrected, and col = "red" moved
# outside aes() so the fitted line is actually drawn in red.
noSalary %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(data = noSalary, aes(x = JobLevel, y = MonthlyIncome), col = "red") +
  ggtitle("Predicted Values using Single Variable Model") +
  scale_color_manual(values = Rcolors)

# distribution of responses
ggplot(noSalary, aes(x=MonthlyIncome)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Predict Using Exponential Variable (Squared)

# Good Exponential Model 
# Quadratic-in-JobLevel model (JobLevel2 = JobLevel^2).
# NOTE(review): "attriton.df2" is misspelled but used consistently below;
# renaming would touch several later chunks, so it is left as-is.
attriton.df2 = attrition.df %>% mutate(JobLevel2 = (JobLevel^2))
fit = lm(MonthlyIncome~JobLevel+JobLevel2, attriton.df2)
summary(fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2, data = attriton.df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4502.5  -744.2  -131.4   656.1  4177.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   299.75     190.65   1.572    0.116    
## JobLevel     1944.17     169.12  11.496   <2e-16 ***
## JobLevel2     397.80      31.56  12.603   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1300 on 867 degrees of freedom
## Multiple R-squared:  0.9202, Adjusted R-squared:   0.92 
## F-statistic:  5001 on 2 and 867 DF,  p-value: < 2.2e-16
# Fitted values and residual diagnostics for the quadratic model
preds2 <- predict(fit)
hist(fit$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit$fitted.values,fit$residuals, main = "Plot of Residuals v. Fitted Values")

# show model with predicted values
# Fix: col = "red" moved outside aes() so the fitted curve is literally
# red rather than being routed through the color scale.
attriton.df2 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(data = attriton.df2, aes(x = JobLevel, y = preds2), col = "red") +
  scale_color_manual(values = Rcolors)

# use model to predict data with no identifiers 
noSalary <- noSalary %>% mutate(JobLevel2 = (JobLevel^2))
test2 <- predict(fit, noSalary, interval = "confidence") 
test2 <- as.data.frame(test2)

# predicted values plotted (column 1 = point estimates)
noSalary$Salary <- test2[,1]
noSalary %>% ggplot(aes(x = JobLevel, y = Salary)) + 
  geom_point() + 
  geom_line(data = noSalary, aes(x = JobLevel, y = Salary), col = "red") +
  scale_color_manual(values = Rcolors)

Predict Using Exponential Variable (Cubed)

# Adding extra exponential variable helped
# Cubic-in-JobLevel model (adds JobLevel^2 and JobLevel^3 terms)
attriton.df3 = attrition.df %>% mutate(JobLevel2 = (JobLevel^2), 
                                       JobLevel3 = (JobLevel^3))
fit2 = lm(MonthlyIncome~JobLevel+JobLevel2+JobLevel3, attriton.df3)
summary(fit2)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3, 
##     data = attriton.df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4796.8  -669.9  -108.7   652.2  4456.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3012.60     435.69   6.915 9.11e-12 ***
## JobLevel    -2176.07     620.83  -3.505  0.00048 ***
## JobLevel2    2125.20     252.82   8.406  < 2e-16 ***
## JobLevel3    -207.57      30.15  -6.884 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1267 on 866 degrees of freedom
## Multiple R-squared:  0.9244, Adjusted R-squared:  0.9241 
## F-statistic:  3528 on 3 and 866 DF,  p-value: < 2.2e-16
# Fitted values and residual diagnostics for the cubic model
preds3 <- predict(fit2)
hist(fit2$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit2$fitted.values,fit2$residuals, main = "Plot of Residuals v. Fitted Values")

# plot with predicted values on original dataset
# Fix: col = "red" moved outside aes() so the fitted curve is literally
# red rather than being routed through the color scale.
attriton.df3 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(data = attriton.df3, aes(x = JobLevel, y = preds3), col = "red")

# model with exponential variable
noSalary <- noSalary %>% mutate(JobLevel3 = (JobLevel^3))
test3 <- predict(fit2, noSalary, interval = "confidence") 
test3 <- as.data.frame(test3)

# column 1 = point estimates
noSalary$Salary2 <- test3[,1]

# predicted values plotted
noSalary %>% ggplot(aes(x = JobLevel, y = Salary2)) + 
  geom_point() + 
  geom_line(data = noSalary, aes(x = JobLevel, y = Salary2), col = "red") +
  scale_color_manual(values = Rcolors)

Predict Using Exponential Variable (4th Power)

# Adding extra exponential variable hurt
# Quartic model: JobLevel^2, ^3 and ^4 terms (^4 is the 4th power)
attriton.df4 = attrition.df %>% mutate(JobLevel2 = (JobLevel^2), 
                                       JobLevel3 = (JobLevel^3), 
                                       JobLevel4 = (JobLevel^4))
fit3 = lm(MonthlyIncome~JobLevel+JobLevel2+JobLevel3+JobLevel4, attriton.df4)
summary(fit3)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 + 
##     JobLevel4, data = attriton.df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4642.2  -668.0  -107.3   668.3  4412.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -245.90    1597.80  -0.154   0.8777  
## JobLevel     4177.61    3061.29   1.365   0.1727  
## JobLevel2   -1910.41    1920.81  -0.995   0.3202  
## JobLevel3     810.46     481.29   1.684   0.0926 .
## JobLevel4     -87.95      41.50  -2.119   0.0343 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9244 
## F-statistic:  2658 on 4 and 865 DF,  p-value: < 2.2e-16
# Fix: stored as preds_q4 — the original reassigned preds3, clobbering the
# cubic model's predictions computed (and plotted) in the previous section.
preds_q4 <- predict(fit3)
hist(fit3$residuals, col = "blue", main = "Histogram of Residuals")

plot(fit3$fitted.values,fit3$residuals, main = "Plot of Residuals v. Fitted Values")

# Fix: col = "red" moved outside aes() so the fitted curve is drawn red
attriton.df4 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) + 
  geom_point() + 
  geom_line(data = attriton.df4, aes(x = JobLevel, y = preds_q4), col = "red") +
  scale_color_manual(values = Rcolors)

Multiple Variable Linear Regression

# best model !!!
# Multiple regression: Age + JobLevel + JobRole (JobRole enters as
# dummy variables, one coefficient per role)
Model2_fit = lm(MonthlyIncome~Age+JobLevel+JobRole, data = attrition.df)
summary(Model2_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = attrition.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3639.3  -671.7   -46.5   634.8  4097.4 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -564.822    249.203  -2.267  0.02367 *  
## Age                             13.701      4.729   2.898  0.00386 ** 
## JobLevel                      3032.839     69.606  43.571  < 2e-16 ***
## JobRoleHuman Resources        -324.725    255.782  -1.270  0.20459    
## JobRoleLaboratory Technician  -526.524    171.357  -3.073  0.00219 ** 
## JobRoleManager                3968.183    232.376  17.077  < 2e-16 ***
## JobRoleManufacturing Director   85.832    169.605   0.506  0.61294    
## JobRoleResearch Director      3941.604    217.548  18.118  < 2e-16 ***
## JobRoleResearch Scientist     -249.221    171.560  -1.453  0.14668    
## JobRoleSales Executive        -127.854    146.073  -0.875  0.38167    
## JobRoleSales Representative   -404.623    215.498  -1.878  0.06077 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1080 on 859 degrees of freedom
## Multiple R-squared:  0.9455, Adjusted R-squared:  0.9448 
## F-statistic:  1490 on 10 and 859 DF,  p-value: < 2.2e-16
preds4 <- predict(Model2_fit)  # NOTE(review): computed but never used below

hist(Model2_fit$residuals, col = "blue", main = "Histogram of Residuals")

# Square root of the residual sum of squares (not RMSE)
sqrt(sum((Model2_fit$residuals)^2))
## [1] 31647.6
# predict with this model
preds5 <- predict(Model2_fit, noSalary)
preds5<- as.data.frame(preds5)
noSalary$Salary3 <- preds5[,1]

# plot predicted values
noSalary %>% ggplot(aes(x = Age, y = Salary3)) + 
  geom_point() + 
  #geom_line(data = noSalary, aes(x = Age, y = Salary3 , col = "red")) +
  scale_color_manual(values = Rcolors) +
  ggtitle("Predicted Salary by Age") +
  ylab("Salary")

#write table
write.csv(noSalary, file = "Case2PredictionsBoyd_Pandey Salary.csv", row.names=FALSE)
# Decent model — try adding JobSatisfaction as an additional predictor.
# NOTE: this reuses (overwrites) the name Model2_fit from the earlier fit.
Model2_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction,
                 data = attrition.df)
summary(Model2_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction, 
##     data = attrition.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3630.4  -666.4   -39.3   631.2  4094.3 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -598.056    267.029  -2.240  0.02537 *  
## Age                             13.700      4.731   2.896  0.00388 ** 
## JobLevel                      3033.178     69.649  43.550  < 2e-16 ***
## JobRoleHuman Resources        -321.176    256.116  -1.254  0.21018    
## JobRoleLaboratory Technician  -524.491    171.544  -3.057  0.00230 ** 
## JobRoleManager                3971.223    232.659  17.069  < 2e-16 ***
## JobRoleManufacturing Director   87.033    169.727   0.513  0.60823    
## JobRoleResearch Director      3945.016    217.881  18.106  < 2e-16 ***
## JobRoleResearch Scientist     -248.490    171.661  -1.448  0.14810    
## JobRoleSales Executive        -126.622    146.191  -0.866  0.38665    
## JobRoleSales Representative   -402.660    215.682  -1.867  0.06225 .  
## JobSatisfaction                 11.475     33.006   0.348  0.72819    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1080 on 858 degrees of freedom
## Multiple R-squared:  0.9455, Adjusted R-squared:  0.9448 
## F-statistic:  1353 on 11 and 858 DF,  p-value: < 2.2e-16
# Predictions from the JobSatisfaction-augmented model on the noSalary set.
Model2_Preds <- predict(Model2_fit, newdata = noSalary)

# Abandoned MSPE/RMSE evaluation attempt — kept for reference.
# The predictions didn't seem to match the values given by the model, and
# `dfTest` is not defined in this script, so the block below was disabled.

# MSPE = data.frame(Observed = dfTest$MonthlyIncome, Predicted = Model2_Preds)
# MSPE$Resisdual = MSPE$Observed - MSPE$Predicted
# MSPE$SquaredResidual = MSPE$Resisdual^2
# MSPE
# mean(MSPE$SquaredResidual)
# RMSE <- mean((MSPE$Observed - MSPE$Predicted)^2) %>% sqrt()
# RMSE